library(readxl)
## Warning: package 'readxl' was built under R version 4.3.3
library(tseries)
## Warning: package 'tseries' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(forecast)
## Warning: package 'forecast' was built under R version 4.3.3
library(urca)
## Warning: package 'urca' was built under R version 4.3.3
library(Metrics)
## Warning: package 'Metrics' was built under R version 4.3.3
## 
## Attaching package: 'Metrics'
## The following object is masked from 'package:forecast':
## 
##     accuracy
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(TSstudio)
## Warning: package 'TSstudio' was built under R version 4.3.3
data_revenue = read_excel("D:/Quarterly_Revenue.xlsx", col_names = FALSE)
## New names:
## • `` -> `...1`
ts_revenue = ts(data_revenue, start = c(2010, 1), frequency = 4)
time = seq_along(ts_revenue)

ts_plot(ts_revenue,
        title = "",
        Xtitle = "Năm",
        Ytitle = "Doanh thu thuần (Tỷ VND)")
#Chia tap train va val
train_revenue = ts_revenue[1:(length(ts_revenue) - 4)]
test_revenue = ts_revenue[(length(ts_revenue) - 3):length(ts_revenue)]
time_train = head(time, -4)
time_test = tail(time, 4)
ts_train = ts(train_revenue, start = c(2010, 1), frequency = 4)
#Lin-Lin
model_lin_lin = lm(ts_train ~ time_train)
summary(model_lin_lin)
## 
## Call:
## lm(formula = ts_train ~ time_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -92.733 -38.407  -1.619  37.906 117.224 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 405.9065    13.0489  31.107  < 2e-16 ***
## time_train   -2.3369     0.3983  -5.868 2.78e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 48.17 on 54 degrees of freedom
## Multiple R-squared:  0.3894, Adjusted R-squared:  0.378 
## F-statistic: 34.43 on 1 and 54 DF,  p-value: 2.776e-07
rmse(ts_train, fitted(model_lin_lin))
## [1] 47.30389
mape(ts_train, fitted(model_lin_lin))
## [1] 0.1198491
future_data = data.frame(time_train = 57:60)
forecast_lin_lin = predict(model_lin_lin, newdata = future_data)
forecast_lin_lin
##        1        2        3        4 
## 272.7006 270.3637 268.0268 265.6898
rmse(test_revenue, forecast_lin_lin)
## [1] 21.16687
mape(test_revenue, forecast_lin_lin)
## [1] 0.0638153
#Lin-Log
model_lin_log = lm(ts_train ~ log(time_train))
summary(model_lin_log)
## 
## Call:
## lm(formula = ts_train ~ log(time_train))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -113.39  -44.42   10.45   42.40  115.58 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      435.928     26.360  16.538  < 2e-16 ***
## log(time_train)  -31.395      8.229  -3.815 0.000352 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 54.71 on 54 degrees of freedom
## Multiple R-squared:  0.2123, Adjusted R-squared:  0.1977 
## F-statistic: 14.56 on 1 and 54 DF,  p-value: 0.0003519
rmse(ts_train, fitted(model_lin_log))
## [1] 53.7249
mape(ts_train, fitted(model_lin_log))
## [1] 0.1438712
forecast_lin_log = predict(model_lin_log, newdata = future_data)
forecast_lin_log
##        1        2        3        4 
## 308.9974 308.4514 307.9147 307.3871
rmse(test_revenue, forecast_lin_log)
## [1] 33.25875
mape(test_revenue, forecast_lin_log)
## [1] 0.106548
#Log-Lin
model_log_lin = lm(log(ts_train) ~ time_train)
summary(model_log_lin)
## 
## Call:
## lm(formula = log(ts_train) ~ time_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.36424 -0.11519  0.01051  0.11770  0.32931 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.018104   0.040126 149.982  < 2e-16 ***
## time_train  -0.007303   0.001225  -5.963 1.95e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1481 on 54 degrees of freedom
## Multiple R-squared:  0.3971, Adjusted R-squared:  0.3859 
## F-statistic: 35.56 on 1 and 54 DF,  p-value: 1.951e-07
rmse(ts_train, exp(fitted(model_log_lin)))
## [1] 47.93705
mape(ts_train, exp(fitted(model_log_lin)))
## [1] 0.1184165
forecast_log_lin = exp(predict(model_log_lin, newdata = future_data))
forecast_log_lin
##        1        2        3        4 
## 270.9177 268.9463 266.9892 265.0464
rmse(test_revenue, forecast_log_lin)
## [1] 21.48485
mape(test_revenue, forecast_log_lin)
## [1] 0.06220432
#Log-Log
model_log_log = lm(log(ts_train) ~ log(time_train))
summary(model_log_log)
## 
## Call:
## lm(formula = log(ts_train) ~ log(time_train))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.42848 -0.12795  0.04048  0.13078  0.32437 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      6.11318    0.08126  75.228  < 2e-16 ***
## log(time_train) -0.09852    0.02537  -3.884 0.000283 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1687 on 54 degrees of freedom
## Multiple R-squared:  0.2183, Adjusted R-squared:  0.2039 
## F-statistic: 15.08 on 1 and 54 DF,  p-value: 0.0002827
rmse(ts_train, exp(fitted(model_log_log)))
## [1] 55.03356
mape(ts_train, exp(fitted(model_log_log)))
## [1] 0.1442767
forecast_log_log = exp(predict(model_log_log, newdata = future_data))
forecast_log_log
##        1        2        3        4 
## 303.3401 302.8208 302.3113 301.8111
rmse(test_revenue, forecast_log_log)
## [1] 28.54126
mape(test_revenue, forecast_log_log)
## [1] 0.0863775
#Dat bien gia mua vu
season_q1 = c(rep(c(1, 0, 0, 0), 15))[1:56]
season_q2 = c(rep(c(0, 1, 0, 0), 15))[1:56]
season_q3 = c(rep(c(0, 0, 1, 0), 15))[1:56]
season_q4 = c(rep(c(0, 0, 0, 1), 15))[1:56]
#Mua vu tuyen tinh dang cong
model_season_add = lm(ts_train ~ time_train + season_q2 + season_q3 + season_q4)
summary(model_season_add)
## 
## Call:
## lm(formula = ts_train ~ time_train + season_q2 + season_q3 + 
##     season_q4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -97.850 -38.062  -0.611  36.739 111.955 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 402.1532    17.2311  23.339  < 2e-16 ***
## time_train   -2.3496     0.4097  -5.735 5.32e-07 ***
## season_q2     4.4924    18.6890   0.240    0.811    
## season_q3     2.4849    18.7025   0.133    0.895    
## season_q4     9.4773    18.7249   0.506    0.615    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 49.43 on 51 degrees of freedom
## Multiple R-squared:  0.3927, Adjusted R-squared:  0.345 
## F-statistic: 8.243 on 4 and 51 DF,  p-value: 3.308e-05
rmse(ts_train, fitted(model_season_add))
## [1] 47.17603
mape(ts_train, fitted(model_season_add))
## [1] 0.1200855
new_season_q2 = c(0,1,0,0)
new_season_q3 = c(0,0,1,0)
new_season_q4 = c(0,0,0,1)

data_forecast_add = data.frame(
  time_train = 57:60,
  season_q2 = new_season_q2,
  season_q3 = new_season_q3,
  season_q4 = new_season_q4
)

forecast_season_add = predict(model_season_add, newdata = data_forecast_add)
forecast_season_add
##        1        2        3        4 
## 268.2266 270.3695 266.0124 270.6552
rmse(test_revenue, forecast_season_add)
## [1] 20.21466
mape(test_revenue, forecast_season_add)
## [1] 0.05712073
#Mua vu tuyen tinh dang nhan
interaction_q2 = time_train * season_q2
interaction_q3 = time_train * season_q3
interaction_q4 = time_train * season_q4

model_season_mul = lm(ts_train ~ time_train + interaction_q2 + interaction_q3 + interaction_q4)
summary(model_season_mul)
## 
## Call:
## lm(formula = ts_train ~ time_train + interaction_q2 + interaction_q3 + 
##     interaction_q4)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -103.664  -38.377   -1.459   39.962  108.973 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    406.11284   13.36904  30.377  < 2e-16 ***
## time_train      -2.38427    0.55580  -4.290 7.98e-05 ***
## interaction_q2  -0.02652    0.58496  -0.045    0.964    
## interaction_q3  -0.09690    0.57766  -0.168    0.867    
## interaction_q4   0.27076    0.57085   0.474    0.637    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 49.32 on 51 degrees of freedom
## Multiple R-squared:  0.3954, Adjusted R-squared:  0.348 
## F-statistic: 8.338 on 4 and 51 DF,  p-value: 2.966e-05
rmse(ts_train, fitted(model_season_mul))
## [1] 47.06931
mape(ts_train, fitted(model_season_mul))
## [1] 0.1184726
data_forecast_mul = data.frame(
  time_train = 57:60,
  interaction_q2 = 57:60 * new_season_q2,
  interaction_q3 = 57:60 * new_season_q3,
  interaction_q4 = 57:60 * new_season_q4
)

forecast_season_mul = predict(model_season_mul, newdata = data_forecast_mul)
forecast_season_mul
##        1        2        3        4 
## 270.2092 266.2865 259.7236 279.3023
rmse(test_revenue, forecast_season_mul)
## [1] 20.39319
mape(test_revenue, forecast_season_mul)
## [1] 0.05296969
#HW mua vu dang cong
model_hw_add = HoltWinters(ts_train, seasonal = "additive")
rmse(ts_train, fitted(model_hw_add)[,1])
## [1] 46.4913
mape(ts_train, fitted(model_hw_add)[,1])
## [1] 0.1161673
forecast_hw_add = forecast(model_hw_add, h = 4)
forecast_hw_add$mean
##          Qtr1     Qtr2     Qtr3     Qtr4
## 2024 263.0070 268.0720 267.6760 276.5073
rmse(test_revenue, forecast_hw_add$mean)
## [1] 17.84044
mape(test_revenue, forecast_hw_add$mean)
## [1] 0.05297274
#HW mua vu dang nhan
model_hw_mul = HoltWinters(ts_train, seasonal = "multiplicative")
rmse(ts_train, fitted(model_hw_mul)[,1])
## [1] 46.38345
mape(ts_train, fitted(model_hw_mul)[,1])
## [1] 0.1151952
forecast_hw_mul = forecast(model_hw_mul, h = 4)
forecast_hw_mul$mean
##          Qtr1     Qtr2     Qtr3     Qtr4
## 2024 269.2018 271.5326 269.6692 277.6537
rmse(test_revenue, forecast_hw_mul$mean)
## [1] 17.12226
mape(test_revenue, forecast_hw_mul$mean)
## [1] 0.05012387
#Chon HW dang nhan la mo hinh tot nhat du bao
model_hw_full = HoltWinters(ts_revenue, seasonal = "multiplicative")
forecast_hw_2025 = forecast(model_hw_full, h = 4)
forecast_hw_2025$mean
##          Qtr1     Qtr2     Qtr3     Qtr4
## 2025 280.9046 283.1359 289.1929 295.3345
#Chuoi gia va loi suat
data_price = read_excel("D:/VTO_PriceHistory.xlsx")
price = ts(as.numeric(data_price$Price), start = 1, frequency = 1)

plot(price, type = "l", col = "blue")

log_return = diff(log(price)) * 100
log_return = ts(log_return, start = 1, frequency = 1)

plot(log_return, type = "l", col = "red")

#ADF
summary(ur.df(price, type = "trend", selectlags = "AIC"))
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression trend 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.00409 -0.10273 -0.02009  0.08399  1.00524 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  0.2364029  0.0781692   3.024  0.00262 **
## z.lag.1     -0.0355467  0.0116694  -3.046  0.00244 **
## tt           0.0005649  0.0001922   2.939  0.00344 **
## z.diff.lag   0.0642412  0.0448251   1.433  0.15245   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2445 on 493 degrees of freedom
## Multiple R-squared:  0.02085,    Adjusted R-squared:  0.01489 
## F-statistic: 3.499 on 3 and 493 DF,  p-value: 0.01547
## 
## 
## Value of test-statistic is: -3.0461 3.7485 4.7054 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2  6.15  4.71  4.05
## phi3  8.34  6.30  5.36
summary(ur.df(price, type = "drift", selectlags = "AIC"))
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression drift 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.08029 -0.10421 -0.01303  0.08552  0.95832 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.056477   0.048989   1.153    0.250
## z.lag.1     -0.004076   0.004677  -0.872    0.384
## z.diff.lag   0.048779   0.044858   1.087    0.277
## 
## Residual standard error: 0.2464 on 494 degrees of freedom
## Multiple R-squared:  0.003691,   Adjusted R-squared:  -0.0003427 
## F-statistic: 0.915 on 2 and 494 DF,  p-value: 0.4012
## 
## 
## Value of test-statistic is: -0.8715 1.2831 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau2 -3.44 -2.87 -2.57
## phi1  6.47  4.61  3.79
summary(ur.df(price, type = "none", selectlags = "AIC"))
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression none 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.10291 -0.09768 -0.00872  0.09025  0.94131 
## 
## Coefficients:
##            Estimate Std. Error t value Pr(>|t|)
## z.lag.1    0.001176   0.001058   1.112    0.267
## z.diff.lag 0.046391   0.044825   1.035    0.301
## 
## Residual standard error: 0.2464 on 495 degrees of freedom
## Multiple R-squared:  0.005037,   Adjusted R-squared:  0.001017 
## F-statistic: 1.253 on 2 and 495 DF,  p-value: 0.2866
## 
## 
## Value of test-statistic is: 1.1119 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau1 -2.58 -1.95 -1.62
#ADF sai phan bac 1
summary(ur.df(diff(price), type = "trend", selectlags = "AIC"))
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression trend 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.07582 -0.09841 -0.00927  0.08457  0.97624 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.465e-03  2.227e-02   0.335    0.738    
## z.lag.1     -9.975e-01  6.210e-02 -16.063   <2e-16 ***
## tt           3.159e-05  7.738e-05   0.408    0.683    
## z.diff.lag   4.374e-02  4.491e-02   0.974    0.330    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2467 on 492 degrees of freedom
## Multiple R-squared:  0.4791, Adjusted R-squared:  0.4759 
## F-statistic: 150.8 on 3 and 492 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -16.0628 86.0108 129.015 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2  6.15  4.71  4.05
## phi3  8.34  6.30  5.36
summary(ur.df(diff(price), type = "drift", selectlags = "AIC"))
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression drift 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.07192 -0.10156 -0.01263  0.08476  0.97982 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.01534    0.01111   1.380    0.168    
## z.lag.1     -0.99705    0.06204 -16.072   <2e-16 ***
## z.diff.lag   0.04361    0.04487   0.972    0.332    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2465 on 493 degrees of freedom
## Multiple R-squared:  0.4789, Adjusted R-squared:  0.4768 
## F-statistic: 226.5 on 2 and 493 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -16.0717 129.1513 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau2 -3.44 -2.87 -2.57
## phi1  6.47  4.61  3.79
summary(ur.df(diff(price), type = "none", selectlags = "AIC"))
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression none 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.06190 -0.08668  0.00307  0.10000  0.98850 
## 
## Coefficients:
##            Estimate Std. Error t value Pr(>|t|)    
## z.lag.1    -0.98943    0.06185 -15.998   <2e-16 ***
## z.diff.lag  0.03974    0.04482   0.887    0.376    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2467 on 494 degrees of freedom
## Multiple R-squared:  0.4769, Adjusted R-squared:  0.4748 
## F-statistic: 225.2 on 2 and 494 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -15.9978 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau1 -2.58 -1.95 -1.62
#Kiem tra bac AR,MA
Acf(diff(price))

Pacf(diff(price))

model_arima_5_13 = arima(price, c(5, 1, 13))
summary(model_arima_5_13)
## 
## Call:
## arima(x = price, order = c(5, 1, 13))
## 
## Coefficients:
##          ar1      ar2      ar3     ar4      ar5      ma1     ma2     ma3
##       0.4001  -0.3227  -0.1816  0.2864  -0.8311  -0.3567  0.2578  0.1894
## s.e.  0.1341   0.1760   0.2064  0.1690   0.1206   0.1405  0.1742  0.1963
##           ma4     ma5     ma6      ma7     ma8      ma9     ma10    ma11
##       -0.2968  0.6297  0.1674  -0.1924  0.0026  -0.0083  -0.1977  0.0334
## s.e.   0.1603  0.1154  0.0613   0.0706  0.0798   0.0709   0.0699  0.0555
##          ma12    ma13
##       -0.0399  0.1090
## s.e.   0.0518  0.0508
## 
## sigma^2 estimated as 0.05444:  log likelihood = 13.62,  aic = 10.77
## 
## Training set error measures:
##                      ME      RMSE     MAE       MPE     MAPE    MASE
## Training set 0.02131319 0.2330877 0.15424 0.1855524 1.439599 1.00028
##                     ACF1
## Training set -0.01195235
autoplot(model_arima_5_13)

checkresiduals(model_arima_5_13)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,1,13)
## Q* = 8.0318, df = 3, p-value = 0.04536
## 
## Model df: 18.   Total lags used: 21
model_arima_5_5 = arima(price, c(5, 1, 5))
summary(model_arima_5_5)
## 
## Call:
## arima(x = price, order = c(5, 1, 5))
## 
## Coefficients:
##           ar1     ar2      ar3      ar4     ar5     ma1      ma2     ma3
##       -0.0932  0.2145  -0.1939  -0.0292  0.1493  0.1511  -0.2674  0.1483
## s.e.   0.2087  0.2406   0.1913   0.2298  0.1835  0.1981   0.2354  0.1870
##          ma4      ma5
##       0.0305  -0.3767
## s.e.  0.2285   0.1755
## 
## sigma^2 estimated as 0.05741:  log likelihood = 4.7,  aic = 12.59
## 
## Training set error measures:
##                      ME      RMSE       MAE       MPE     MAPE    MASE
## Training set 0.02311008 0.2393676 0.1551251 0.2014103 1.444021 1.00602
##                      ACF1
## Training set -0.007405784
autoplot(model_arima_5_5)

checkresiduals(model_arima_5_5)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,1,5)
## Q* = 4.0782, df = 3, p-value = 0.2531
## 
## Model df: 10.   Total lags used: 13
model_arima_13_5 = arima(price, c(13, 1, 5))
summary(model_arima_13_5)
## 
## Call:
## arima(x = price, order = c(13, 1, 5))
## 
## Coefficients:
##          ar1      ar2      ar3      ar4      ar5     ar6      ar7     ar8
##       0.1443  -0.2147  -0.1314  -0.0451  -0.7586  0.0782  -0.1469  0.0219
## s.e.  0.2214   0.1892   0.1623   0.2239   0.1471  0.0779   0.0690  0.0691
##           ar9     ar10    ar11    ar12    ar13      ma1     ma2     ma3     ma4
##       -0.0816  -0.1398  0.0177  -0.087  0.1525  -0.0988  0.1520  0.1265  0.0431
## s.e.   0.0695   0.0604  0.0510   0.050  0.0532   0.2231  0.1857  0.1610  0.2230
##          ma5
##       0.5531
## s.e.  0.1395
## 
## sigma^2 estimated as 0.0555:  log likelihood = 12.86,  aic = 12.27
## 
## Training set error measures:
##                      ME      RMSE       MAE       MPE     MAPE     MASE
## Training set 0.02076937 0.2353462 0.1559738 0.1801014 1.454261 1.011524
##                      ACF1
## Training set -0.004320014
autoplot(model_arima_13_5)

checkresiduals(model_arima_13_5)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(13,1,5)
## Q* = 6.3139, df = 3, p-value = 0.0973
## 
## Model df: 18.   Total lags used: 21
#So sanh voi thuc te
fc_price = forecast(model_arima_5_5, h = 10)$mean
price_real = c(14.8, 14.45, 13.9, 13.8, 14.15, 14.05, 13.8, 13.85, 13.8, 14.0)

rmse(price_real, fc_price)
## [1] 0.801461
mape(price_real, fc_price)
## [1] 0.05322225
df_plot_price = data.frame(tt = seq_along(price_real), price_real, fc_price)

ggplot(df_plot_price, aes(x = tt)) +
  geom_line(aes(y = price_real, color = "Giá thực tế"), linewidth = 2) +
  geom_line(aes(y = fc_price, color = "Giá dự báo"), linewidth = 2) +
  labs(title = "So sánh giá cổ phiếu dự báo và thực tế", x = "Time", y = "Price") +
  scale_color_manual(values = c("Giá thực tế" = "blue", "Giá dự báo" = "red")) +
  theme_minimal()

#ADF chuoi loi suat
summary(ur.df(log_return, type = "trend", selectlags = "AIC"))
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression trend 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.5193 -1.0200 -0.1129  0.9192  6.7045 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.1260676  0.1917304   0.658    0.511    
## z.lag.1     -1.0235062  0.0618330 -16.553   <2e-16 ***
## tt           0.0001022  0.0006656   0.154    0.878    
## z.diff.lag   0.0655977  0.0445370   1.473    0.141    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.122 on 492 degrees of freedom
## Multiple R-squared:  0.4832, Adjusted R-squared:  0.4801 
## F-statistic: 153.4 on 3 and 492 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -16.5527 91.3475 137.0164 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2  6.15  4.71  4.05
## phi3  8.34  6.30  5.36
summary(ur.df(log_return, type = "drift", selectlags = "AIC"))
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression drift 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.5049 -1.0182 -0.1156  0.9204  6.7175 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.15158    0.09569   1.584    0.114    
## z.lag.1     -1.02353    0.06177 -16.570   <2e-16 ***
## z.diff.lag   0.06566    0.04449   1.476    0.141    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.12 on 493 degrees of freedom
## Multiple R-squared:  0.4832, Adjusted R-squared:  0.4811 
## F-statistic: 230.5 on 2 and 493 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -16.5696 137.2814 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau2 -3.44 -2.87 -2.57
## phi1  6.47  4.61  3.79
summary(ur.df(log_return, type = "none", selectlags = "AIC"))
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression none 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.3253 -0.8714  0.0410  1.0683  6.8047 
## 
## Coefficients:
##            Estimate Std. Error t value Pr(>|t|)    
## z.lag.1    -1.01360    0.06155 -16.469   <2e-16 ***
## z.diff.lag  0.06055    0.04444   1.362    0.174    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.123 on 494 degrees of freedom
## Multiple R-squared:  0.4806, Adjusted R-squared:  0.4785 
## F-statistic: 228.5 on 2 and 494 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -16.4689 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau1 -2.58 -1.95 -1.62
#Kiem tra bac AR, MA
Acf(log_return)

Pacf(log_return)

model_ls_5_5 = arima(log_return, c(5, 0, 5))
## Warning in arima(log_return, c(5, 0, 5)): possible convergence problem: optim
## gave code = 1
summary(model_ls_5_5)
## 
## Call:
## arima(x = log_return, order = c(5, 0, 5))
## 
## Coefficients:
##          ar1     ar2      ar3     ar4     ar5     ma1      ma2     ma3      ma4
##       0.0169  0.3053  -0.2824  0.0999  0.1873  0.0320  -0.3961  0.2187  -0.0664
## s.e.  0.4458  0.3576   0.6190  0.2565  0.3401  0.4376   0.3435  0.6454   0.2598
##           ma5  intercept
##       -0.3636     0.1628
## s.e.   0.3854     0.0595
## 
## sigma^2 estimated as 4.366:  log likelihood = -1073.74,  aic = 2171.48
## 
## Training set error measures:
##                       ME     RMSE      MAE MPE MAPE      MASE         ACF1
## Training set 0.003570057 2.089456 1.433722 NaN  Inf 0.6801143 0.0004672621
autoplot(model_ls_5_5)

checkresiduals(model_ls_5_5)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,0,5) with non-zero mean
## Q* = 2.8165, df = 3, p-value = 0.4208
## 
## Model df: 10.   Total lags used: 13
model_ls_18_5 = arima(log_return, c(18, 0, 5))
summary(model_ls_18_5)
## 
## Call:
## arima(x = log_return, order = c(18, 0, 5))
## 
## Coefficients:
##          ar1      ar2     ar3     ar4      ar5     ar6      ar7     ar8
##       0.5553  -0.3288  0.1875  0.4344  -0.8178  0.1805  -0.1834  0.0672
## s.e.  0.1725   0.1881  0.1911  0.1947   0.1525  0.0752   0.0759  0.0840
##           ar9     ar10     ar11     ar12    ar13     ar14    ar15     ar16
##       -0.0473  -0.0635  -0.0019  -0.0865  0.1265  -0.1022  0.0270  -0.0747
## s.e.   0.0856   0.0834   0.0778   0.0735  0.0715   0.0679  0.0656   0.0629
##          ar17     ar18      ma1     ma2      ma3      ma4     ma5  intercept
##       -0.0483  -0.0481  -0.5128  0.2334  -0.1905  -0.4630  0.6710     0.1634
## s.e.   0.0608   0.0576   0.1681  0.1802   0.1717   0.1761  0.1385     0.0553
## 
## sigma^2 estimated as 4.126:  log likelihood = -1062.17,  aic = 2174.34
## 
## Training set error measures:
##                       ME     RMSE      MAE MPE MAPE      MASE          ACF1
## Training set 0.003655175 2.031373 1.429586 NaN  Inf 0.6781525 -0.0008185149
autoplot(model_ls_18_5)

checkresiduals(model_ls_18_5)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(18,0,5) with non-zero mean
## Q* = 5.0913, df = 3, p-value = 0.1652
## 
## Model df: 23.   Total lags used: 26
model_ls_5_18 = arima(log_return, c(5, 0, 18))
summary(model_ls_5_18)
## 
## Call:
## arima(x = log_return, order = c(5, 0, 18))
## 
## Coefficients:
##          ar1      ar2      ar3     ar4      ar5      ma1     ma2     ma3
##       0.4585  -0.1509  -0.0217  0.4310  -0.7173  -0.4125  0.0653  0.0077
## s.e.  0.1700   0.1478   0.1446  0.1343   0.1417   0.1760  0.1536  0.1414
##           ma4     ma5     ma6      ma7     ma8      ma9     ma10     ma11
##       -0.4360  0.5706  0.1536  -0.1336  0.0075  -0.0057  -0.0704  -0.0018
## s.e.   0.1326  0.1484  0.0634   0.0622  0.0646   0.0640   0.0638   0.0623
##          ma12    ma13     ma14     ma15     ma16     ma17     ma18  intercept
##       -0.0412  0.0992  -0.0508  -0.0172  -0.0319  -0.0253  -0.0641     0.1618
## s.e.   0.0624  0.0669   0.0684   0.0543   0.0525   0.0615   0.0715     0.0567
## 
## sigma^2 estimated as 4.158:  log likelihood = -1063.98,  aic = 2177.96
## 
## Training set error measures:
##                      ME     RMSE      MAE MPE MAPE      MASE         ACF1
## Training set 0.00411781 2.039021 1.429853 NaN  Inf 0.6782792 -0.002195243
autoplot(model_ls_5_18)

checkresiduals(model_ls_5_18)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,0,18) with non-zero mean
## Q* = 6.147, df = 3, p-value = 0.1047
## 
## Model df: 23.   Total lags used: 26
model_ls_auto = auto.arima(log_return)
summary(model_ls_auto)
## Series: log_return 
## ARIMA(5,0,0) with non-zero mean 
## 
## Coefficients:
##          ar1      ar2      ar3     ar4      ar5    mean
##       0.0460  -0.0701  -0.0401  0.0075  -0.1583  0.1647
## s.e.  0.0446   0.0446   0.0451  0.0450   0.0450  0.0777
## 
## sigma^2 = 4.481:  log likelihood = -1077.16
## AIC=2168.33   AICc=2168.56   BIC=2197.8
## 
## Training set error measures:
##                       ME     RMSE      MAE MPE MAPE      MASE        ACF1
## Training set 0.001733249 2.104128 1.449586 NaN  Inf 0.6876398 0.002258355
autoplot(model_ls_auto)

checkresiduals(model_ls_auto)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(5,0,0) with non-zero mean
## Q* = 6.0777, df = 5, p-value = 0.2987
## 
## Model df: 5.   Total lags used: 10
#So sanh voi loi suat thuc te
log_return_real = c(0.677968699, -2.393276621, -3.880557442, -0.722024797, 2.504603193,
                    -0.709222831, -1.795380362, 0.361664047, -0.361664047, 1.438873745)

fc_log_return = forecast(model_ls_5_5, h = 10)$mean

rmse(log_return_real, fc_log_return)
## [1] 1.944191
smape(log_return_real, fc_log_return)
## [1] 1.44142
df_plot_return = data.frame(tl = seq_along(log_return_real), log_return_real, fc_log_return)

ggplot(df_plot_return, aes(x = tl)) +
  geom_line(aes(y = log_return_real, color = "Lợi suất thực tế"), linewidth = 1) +
  geom_line(aes(y = fc_log_return, color = "Lợi suất dự báo"), linewidth = 1) +
  labs(title = "So sánh chuỗi log return dự báo và thực tế", x = "Time", y = "Log return") +
  scale_color_manual(values = c("Lợi suất thực tế" = "blue", "Lợi suất dự báo" = "red")) +
  theme_minimal()